In this vignette, we demonstrate how to use DAVINCHI to
To run this vignette, we first load the package.
library(DAVINCHI)
library(data.table)
Here we use the two mouse brain slices as the working example, and we start by loading the first slice (V1).
gene.exp <- fread("~/functional/Globus/RongLab/MouseBrain/V1transcriptome/whole.tsv", header = T, sep = "\t")
row.names <- as.character(gene.exp$V1)
gene.exp <- as.data.frame(gene.exp[,2:ncol(gene.exp)])
rownames(gene.exp) <- row.names
gene.exp <- t(gene.exp)
t1 <- apply(gene.exp,2,function(x){sum(x!=0)})
gene.exp.V1 <- gene.exp[,which(t1!=0)]
#get the coordinates
col.names <- colnames(gene.exp.V1)
x <- as.numeric(unlist(lapply(strsplit(col.names, "x"), function(x){x[1]})))
y <- as.numeric(unlist(lapply(strsplit(col.names, "x"), function(x){x[2]})))
coor.V1 <- cbind(x, y)
colnames(coor.V1) <- c("array_row", "array_col")
rownames(coor.V1) <- col.names
#gene.exp.V1, coor.V1
#add the atac information
input.atac.V1 <- readRDS("~/Desktop/KPMP/code/PLIER_Ahmed/speed_sylvester/MultiModality/SpatialCoAssay/RongLab/input/MouseBrain_V2/ATAC/Mouse_brain_V2_spatial_ATAC_peak_matrix.rds")
#save.label <- "peak"
barcodes <- unlist(lapply(strsplit(colnames(input.atac.V1), "#"), function(x){x[2]}))
barcodes <- gsub("-1", "", barcodes)
meta <- readRDS("~/Desktop/CHARM/AIM/mouse_spleen_ATAC_JUL2023/DBiTseq_2500.coor.RDS")
meta <- meta[match(barcodes, rownames(meta)),]
coor.atac.V1 <- meta[,c("col", "row")]
coor.atac.V1 <- coor.atac.V1+1
#table(unlist(lapply(1:nrow(meta), function(x){grepl(rownames(meta)[x], colnames(input.atac.V1)[x])})))
rownames(coor.atac.V1) <- paste0(coor.atac.V1[,1], "x", coor.atac.V1[,2])
colnames(coor.atac.V1) <- c("array_row", "array_col")
colnames(input.atac.V1) <- rownames(coor.atac.V1)
#coor.atac.V1
input.atac.V1 <- input.atac.V1[,rownames(coor.V1)]
coor.atac.V1 <- coor.atac.V1[rownames(coor.V1),]
Then we load the second slice (V2) of mouse brain samples.
input <- readRDS("~/functional/Globus/RongLab/MouseBrain/V2transcriptome/mbrnamb2_pca20_res1_dims20.rds")
gene.exp.V2 <- input@assays$Spatial@counts
#get the coordinates
col.names <- colnames(gene.exp.V2)
x <- as.numeric(unlist(lapply(strsplit(col.names, "x"), function(x){x[1]})))
y <- as.numeric(unlist(lapply(strsplit(col.names, "x"), function(x){x[2]})))
coor.V2 <- cbind(x, y)
colnames(coor.V2) <- c("array_row", "array_col")
rownames(coor.V2) <- col.names
#gene.exp.V2, coor.V2
#add the atac information
input.atac.V2 <- readRDS("~/Desktop/KPMP/code/PLIER_Ahmed/speed_sylvester/MultiModality/SpatialCoAssay/RongLab/input/MouseBrain_V2/ATAC/Mouse_brain_V2_spatial_ATAC_peak_matrix.rds")
barcodes <- unlist(lapply(strsplit(colnames(input.atac.V2), "#"), function(x){x[2]}))
barcodes <- gsub("-1", "", barcodes)
meta <- readRDS("~/Desktop/CHARM/AIM/mouse_spleen_ATAC_JUL2023/DBiTseq_2500.coor.RDS")
meta <- meta[match(barcodes, rownames(meta)),]
coor.atac.V2 <- meta[,c("col", "row")]
coor.atac.V2 <- coor.atac.V2+1
#table(unlist(lapply(1:nrow(meta), function(x){grepl(rownames(meta)[x], colnames(input.atac.V2)[x])})))
rownames(coor.atac.V2) <- paste0(coor.atac.V2[,1], "x", coor.atac.V2[,2])
colnames(coor.atac.V2) <- c("array_row", "array_col")
colnames(input.atac.V2) <- rownames(coor.atac.V2)
#coor.atac.V2
input.atac.V2 <- input.atac.V2[,rownames(coor.V2)]
coor.atac.V2 <- coor.atac.V2[rownames(coor.V2),]
The general strategy to conduct multi-modal multi-sample integration is to first finish horizontal integration (sample-wise integration for each modality), and then proceed with vertical integration (across modality).
We preprocess gene expression data using a simple and standard
pipeline. The utility function is named as preprocess which
provides flexible options to preprocess the spatial omics data.
fin.V1 <- DAVINCHI::preprocess(gene.exp.V1, coor.V1, type = "rna", graph.opt = "Tri.mesh", frac.thr = 0.95)
fin.V2 <- DAVINCHI::preprocess(gene.exp.V2, coor.V2, type = "rna", graph.opt = "Tri.mesh", frac.thr = 0.95)
To initiate the horizontal integration using
Horizontal.Integration, we need to construct three list
objects,
Y.list list of gene expression matrixL.list list of graph laplaciancoor.list list of spot coordinatesHorizontal.Integration is the core function to perform
horizontal integration with Y.list, L.list and
coor.list as input. k.args encodes the number
of latent variables corresponding to each sample. k=12 is
generally a good choice but you can refer to the other tutorial on how
to optimize this parameter.
Y.list <- list(fin.V1$mat, fin.V2$mat)
L.list <- list(fin.V1$L, fin.V2$L)
coor.list <- list(fin.V1$coor, fin.V2$coor)
ptm <- proc.time()
integration.res <- Horizontal.Integration(Y.list, L.list, coor.list, k.args = c(12,12), LVs.filter.thr = 0.8, mod = "all", remove.LV1 = T)
## [1] "L1 is set to 33.8871316128288"
## [1] "L2 is set to 101.661394838487"
## 50
## [1] "L1 is set to 31.4599804538765"
## [1] "L2 is set to 94.3799413616295"
## 100 50 100
## 200 100 200
## 150 100 200
## 150 100 200
## [1] "L1 is set to 31.4599804538765"
## [1] "L2 is set to 94.3799413616295"
## [1] "L1 is set to 33.8871316128288"
## [1] "L2 is set to 101.661394838487"
print(proc.time()-ptm)
## user system elapsed
## 616.858 10.265 628.630
Before completing the holistic integration, we can review the
clusters based on the RNA-seq integation results. We provide a wrapper
function swk to cluster the integrated latent variables and
get the spatial niche patterns. Here we used model-based clustering
approaches and set the number of clusters as 8. We visualize the spatial
clusters using another wrapper function
scatter.DiscretePlot.
cluster.RNA <- swk(integration.res$LVs_embeddings, method = "mclust", L2norm = T, mclust.model = "EEE", mclust.num = 8, random.seed = 1)
## [1] "Number of unique clusters is 8"
p.V1 <- scatter.DiscretePlot(coor.V1, as.character(cluster.RNA)[integration.res$slice_id=="1"], pt.size = 2)
p.V2 <- scatter.DiscretePlot(coor.V2, as.character(cluster.RNA)[integration.res$slice_id=="2"], pt.size = 2)
ggarrange(p.V1, p.V2)
We preprocess ATAC-seq data using a simple and standard pipeline. The
utility function is named as preprocess which provides
flexible options to preprocess the spatial omics data in general.
LSI normalization is also available as one of the
choices.
fin.atac.V1 <- DAVINCHI::preprocess(input.atac.V1, coor.V1, type = "rna", graph.opt = "Tri.mesh", frac.thr = 0.95)
fin.atac.V2 <- DAVINCHI::preprocess(input.atac.V2, coor.V2, type = "rna", graph.opt = "Tri.mesh", frac.thr = 0.95)
We first construct three objects Y.list,
L.list and coor.list as input to the core
function Horizontal.Integration.
Y.list <- list(fin.atac.V1$mat, fin.atac.V2$mat)
L.list <- list(fin.atac.V1$L, fin.atac.V2$L)
coor.list <- list(fin.atac.V1$coor, fin.atac.V2$coor)
ptm <- proc.time()
integration.atac.res <- Horizontal.Integration(Y.list, L.list, coor.list, k.args = c(12,12), LVs.filter.thr = 0.8, mod = "all", remove.LV1 = T)
## [1] "L1 is set to 14.9287924479"
## [1] "L2 is set to 44.7863773436999"
## 25 25 50
## 37.5 25 50
## 31.25 25 37.5
## 31.25 25 37.5
## [1] "L1 is set to 14.9287924479"
## [1] "L2 is set to 44.7863773436999"
## 25 25 50
## 37.5 25 50
## 31.25 25 37.5
## 31.25 25 37.5
## [1] "L1 is set to 14.9287924479"
## [1] "L2 is set to 44.7863773436999"
## [1] "L1 is set to 14.9287924479"
## [1] "L2 is set to 44.7863773436999"
print(proc.time()-ptm)
## user system elapsed
## 567.754 12.230 581.414
Here we use BiModalIntegration to integrate the spatal
RNA and spatial ATAC, and the input are the latent variables from each
modality.
ptm <- proc.time()
vertical.res <- BiModalIntegration(modal.1 = integration.res$LVs_embeddings, modal.2 = integration.atac.res$LVs_embeddings, n_neighbors = 20)
print(proc.time()-ptm)
## user system elapsed
## 19.899 0.249 20.936
The output from BiModalIntegration includes the modality
weights specifying the weights for integration per spot. We can use
scatter.FeaturePlot again to visualize the weights by
position for all slices of all modalities.
p.rna.weight1 <- scatter.FeaturePlot(coor.V1, LVs = vertical.res$weight.1[grepl("1_", names(vertical.res$weight.1))], plot.all = F, pt.size = 1)+ggtitle("RNA weights, V1")
p.atac.weight1 <- scatter.FeaturePlot(coor.V1, LVs = vertical.res$weight.2[grepl("1_", names(vertical.res$weight.2))], plot.all = F, pt.size = 1)+ggtitle("ATAC weights, V1")
p.rna.weight2 <- scatter.FeaturePlot(coor.V2, LVs = vertical.res$weight.1[grepl("2_", names(vertical.res$weight.1))], plot.all = F, pt.size = 1)+ggtitle("RNA weights, V2")
p.atac.weight2 <- scatter.FeaturePlot(coor.V2, LVs = vertical.res$weight.2[grepl("2_", names(vertical.res$weight.2))], plot.all = F, pt.size = 1)+ggtitle("ATAC weights, V2")
ggarrange(plotlist = list(p.rna.weight1, p.atac.weight1, p.rna.weight2, p.atac.weight2), nrow = 2, ncol = 2)
As we integrate both modalities, we can use the integrative latent
variables for clustering to achieve a more comprehensive niche
assignment. integration.res$snn.mat already encodes the
shared nearest neighborhoood graph, which can be fed into community
clustering algorithm. Here we are using the Seurat implementation of
Leiden algorithm because of its efficiency, but you can use your
preferred network-based algorithm instead to cluser, such as
leiden::leiden.
ptm <- proc.time()
cluster.integrate <- Seurat:::RunLeiden(vertical.res$snn.mat, resolution.parameter = 0.6, method = "matrix")
#cluster.integrate <- Seurat:::RunModularityClustering(integration.res$snn.mat, resolution = 0.6, algorithm = 1)
print(proc.time()-ptm)
## user system elapsed
## 12.672 5.282 16.697
We use scatter.DiscretePlot to visualize the intergative
niches per slice.
p.V1 <- scatter.DiscretePlot(coor.V1, as.character(cluster.integrate)[which(grepl("1_", names(vertical.res$weight.1)))], pt.size = 1)
p.V2 <- scatter.DiscretePlot(coor.V2, as.character(cluster.integrate)[which(grepl("2_", names(vertical.res$weight.1)))], pt.size = 1)
ggarrange(p.V1, p.V2)
After obtaining the integrated niches, we can asses any bias of modality weights per niche to understand how each modality contributes to the niche determination. In this specific example, RNA plays a universally more dominant role compared to ATAC.
weight <- c(vertical.res$weight.1, vertical.res$weight.2)
grp <- c(as.character(cluster.integrate), as.character(cluster.integrate))
modal.name <- c(rep("RNA", length(cluster.integrate)), rep("ATAC", length(cluster.integrate) ))
slice <- rep(unlist(lapply(strsplit(names(vertical.res$weight.1), "_"), function(x){x[1]})), times = 2)
dat.to_plot <- data.frame(weight = weight, grp = grp, modal.name = modal.name, slice = slice)
p.V1 <- ggplot(dat.to_plot[dat.to_plot$slice == "1",], aes(x=grp, y = weight, fill = modal.name))+geom_violin()+theme_pubr(base_size = 20)+xlab("Clusters")+ylab("Modality.weights")+stat_compare_means(method = "wilcox.test", size = 3.5, paired = T)
p.V2 <- ggplot(dat.to_plot[dat.to_plot$slice == "2",], aes(x=grp, y = weight, fill = modal.name))+geom_violin()+theme_pubr(base_size = 20)+xlab("Clusters")+ylab("Modality.weights")+stat_compare_means(method = "wilcox.test", size = 3.5, paired = T)
ggarrange(p.V1, p.V2, nrow = 2)